suppressPackageStartupMessages(library(dplyr))
library(readr)
suppressPackageStartupMessages(library(lubridate))
library(ggplot2)
# Load the downtown Boulder weather CSV (column types are left for readr to
# guess) and print a compact overview of its columns.
wea <- readr::read_csv(
  file = '/Users/Andy/Google Drive/boco-jail/downtown-boulder-weather.csv',
  col_types = readr::cols()
)
dplyr::glimpse(wea)
Observations: 6,400
Variables: 12
$ DATE <int> 20000101, 20000102, 20000103, 20000104, 20000105, 20000106, 20000107, 20000...
$ PRCP <dbl> 0.00, 0.00, 0.08, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.0...
$ SNOW <dbl> 0.0, 0.0, 2.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
$ SNWD <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ TMAX <dbl> 54, 40, 36, 49, 47, 42, 47, 50, 45, 42, 61, 58, 41, 64, 63, 49, 64, 51, 56,...
$ TMIN <dbl> 29, 22, 19, 13, 26, 16, 19, 23, 29, 30, 17, 41, 25, 25, 34, 22, 30, 35, 32,...
$ WT01 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ WT03 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ WT04 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ WT05 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ WT06 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ WT11 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# Per-column summary of the raw weather table (quartiles and NA counts for the
# numeric columns).
wea %>% summary()
DATE PRCP SNOW SNWD TMAX
Min. :20000101 Min. :0.00000 Min. : 0.0000 Length:6400 Min. : 6.00
1st Qu.:20040519 1st Qu.:0.00000 1st Qu.: 0.0000 Class :character 1st Qu.: 53.00
Median :20081004 Median :0.00000 Median : 0.0000 Mode :character Median : 67.00
Mean :20083356 Mean :0.05655 Mean : 0.2372 Mean : 65.99
3rd Qu.:20130222 3rd Qu.:0.01000 3rd Qu.: 0.0000 3rd Qu.: 81.00
Max. :20170714 Max. :9.08000 Max. :16.7000 Max. :102.00
NA's :5 NA's :1190 NA's :1
TMIN WT01 WT03 WT04 WT05
Min. :-17.00 Min. :1 Min. :1 Length:6400 Min. :1
1st Qu.: 27.00 1st Qu.:1 1st Qu.:1 Class :character 1st Qu.:1
Median : 38.00 Median :1 Median :1 Mode :character Median :1
Mean : 38.17 Mean :1 Mean :1 Mean :1
3rd Qu.: 51.00 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1
Max. : 77.00 Max. :1 Max. :1 Max. :1
NA's :3 NA's :6304 NA's :5623 NA's :6337
WT06 WT11
Min. :1 Min. :1
1st Qu.:1 1st Qu.:1
Median :1 Median :1
Mean :1 Mean :1
3rd Qu.:1 3rd Qu.:1
Max. :1 Max. :1
NA's :6370 NA's :6331
# Tidy the weather table: lowercase the column names, parse the integer
# yyyymmdd `date` column into a Date, and keep only the columns used below.
colnames(wea) <- tolower(colnames(wea))
wea <- wea %>%
  mutate(date = lubridate::ymd(date)) %>%
  select(date, prcp, snow, tmax, tmin)
head(wea)

# Daily maximum temperature over time.
ggplot(wea, aes(x = date, y = tmax)) +
  geom_point() +
  labs(y = 'Max Temp', title = 'Downtown Boulder Weather')

# Daily precipitation over time.
# NOTE: the y limits remove observations above 3 from the plot (the summary
# above shows a prcp maximum of 9.08).
ggplot(wea, aes(x = date, y = prcp)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 3))

# Daily snowfall over time.
# NOTE: the y limits remove observations above 3 from the plot (the summary
# above shows a snow maximum of 16.7).
ggplot(wea, aes(x = date, y = snow)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 3))
Some info on the bookings-with-transient-status.csv data:
- fta is “Failure to Appear”
- ftc is “Failure to Comply”
- (Sam) These are bookings in Boulder County Jail only.
- (Sam) Each row is an individual booking.
- (Sam) boulder means the arrest was made by Boulder PD.
- (Sam) The transient column records whether someone was homeless or not.
- (Sam) I created indicator variables for a variety of antihomeless charges (a list that a local law school came up with). antihomeless is true if any of the antihomeless charges existed in the booking. So one source of error could be that if someone was arrested for an antihomeless charge AND a more serious offense, I still mark it as “antihomeless”.
- (Sam) I don’t remember making any_antihomeless – I suspect it’s identical to antihomeless and was introduced by accident.
- (Andy) There are two Booking Time columns? I will use booking_time for now, as it seems better.
- (Andy) What are locations? Is that where they were booked?
- (Andy) Booked and booking_time appear to be duplicates?
# Load the jail bookings (one row per booking).
# `Case No` is declared as double instead of being guessed: guessing picked
# integer, and the values with a trailing ".0" further down the file were then
# reported as parsing failures ("no trailing characters .0"). All other column
# types are still guessed by readr.
bk <- read_csv(
  '/Users/Andy/Google Drive/boco-jail/bookings-with-transient-status.csv',
  col_types = cols(`Case No` = col_double())
)
number of columns of result is not a multiple of vector length (arg 1)46578 parsing failures.
row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 1537 Case No no trailing characters .0 file 2 1539 Case No no trailing characters .0 row 3 1540 Case No no trailing characters .0 col 4 1541 Case No no trailing characters .0 expected 5 1542 Case No no trailing characters .0 actual # ... with 1 more variables: file <chr>
... ................. ... ............................................. ........ ............................................. ...... ............................................. .... ............................................. ... ............................................. ... ............................................. ........ ............................................. ...... .......................................
See problems(...) for more details.
# Compact overview of the raw bookings columns and their parsed types.
bk %>% glimpse()
Observations: 167,633
Variables: 30
$ Name <chr> "HOOD,AARON JAY", "LAWYER,KENNETH A", "AGUILAR TORRES,MIGUE...
$ Booked <dttm> 2000-01-01 02:12:00, 2000-01-01 04:01:00, 2000-01-01 03:11...
$ Location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ DOB <date> 1975-10-08, 1958-09-29, 1972-11-14, 1953-06-07, 1961-09-09...
$ Race <chr> "W", "W", "W", "W", "W", "W", "W", "W", "W", "W", "W", "W",...
$ Sex <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "F", "M",...
$ Case No <int> 991126052, 991001313, 991126053, 991031932, 991102428, 9911...
$ Arresting Agency <chr> "UNIVERSITY OF COLORADO", "JAIL MITTS ONLY", "UNIVERSITY OF...
$ Arrest Date <date> 1999-12-31, 2000-01-16, 1999-12-31, 1999-12-31, 1999-12-31...
$ camping <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ fta <chr> "True", "False", "False", "False", "False", "False", "False...
$ ftc <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ booking_time <dttm> 2000-01-01 02:12:00, 2000-01-01 04:01:00, 2000-01-01 03:11...
$ boulder <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ urination <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ vehicle_as_residence <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ public_obstruct <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ public_trespass <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ begging <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ antihomeless <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ smoking <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ any_antihomeless <chr> "False", "False", "False", "False", "False", "False", "Fals...
$ Address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ City <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ State <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ ZIP Code <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ Booking Date <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ Booking Time <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ Facility <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ transient <chr> "False", "False", "False", "False", "False", "False", "Fals...
# `Booking Time` appears to be a duplicate/bad column
bk <- bk %>% select(-`Booking Time`)
# make column names lowercase and replace spaces with underscores
names(bk) <- gsub(' ', '_', tolower(names(bk)))
# some vars make more sense as factors
bk$arresting_agency <- as.factor(bk$arresting_agency)
bk$location <- as.factor(bk$location)
bk$race <- as.factor(bk$race)
# add wkday, month, year so we can aggregate by those.
# All three are derived from arrest_date. wkday used to be taken from
# booking_date, which is NA for the early rows of the file, so the weekday was
# missing for those bookings while month_ and year were populated.
bk$wkday <- lubridate::wday(bk$arrest_date, label = TRUE)
bk$month_ <- lubridate::month(bk$arrest_date, label = TRUE)
bk$year <- lubridate::year(bk$arrest_date)
# modern reporting seems to start in 2000 (yearly totals go from fewer than 100
# before 2000 to the order of 10,000 starting with 2000). Keep only 2000 onward.
bk <- bk %>% filter(year >= 2000)
# calendar date portion of the `booked` timestamp
bk$booked_date <- lubridate::date(bk$booked)
# a bunch of vars should be logical
# Coerce a column to a logical vector via as.logical(); strings such as
# "True"/"False" become TRUE/FALSE and unrecognised values become NA.
to_log <- function(a_col) as.logical(a_col)
# Flag columns that were read as "True"/"False" strings; convert each one to
# logical in place, then re-inspect the cleaned table.
cols_to_log <- c(
  'camping', 'boulder', 'urination', 'vehicle_as_residence',
  'public_obstruct', 'public_trespass', 'begging', 'antihomeless',
  'smoking', 'any_antihomeless', 'transient', 'fta', 'ftc'
)
bk[cols_to_log] <- Map(to_log, bk[cols_to_log])
glimpse(bk)
Observations: 163,939
Variables: 33
$ name <chr> "LAWYER,KENNETH A", "COPELAND,MARK WILLIAM", "DITZEL,HOWARD...
$ booked <dttm> 2000-01-01 04:01:00, 2000-01-01 03:39:00, 2000-01-01 06:00...
$ location <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ dob <date> 1958-09-29, 1953-01-27, 1948-01-25, 1977-03-15, 1980-06-22...
$ race <fctr> W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W...
$ sex <chr> "M", "M", "M", "F", "M", "M", "M", "F", "M", "M", "M", "M",...
$ case_no <int> 991001313, 1089421, 1031978, 1103574, 5, 1101849, 1089420, ...
$ arresting_agency <fctr> JAIL MITTS ONLY, LAFAYETTE PD, BOULDER PD, LONGMONT PD, BO...
$ arrest_date <date> 2000-01-16, 2000-01-01, 2000-01-01, 2000-01-01, 2000-01-01...
$ camping <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ fta <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
$ ftc <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ booking_time <dttm> 2000-01-01 04:01:00, 2000-01-01 03:39:00, 2000-01-01 06:00...
$ boulder <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ urination <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ vehicle_as_residence <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ public_obstruct <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ public_trespass <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ begging <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ antihomeless <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ smoking <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ any_antihomeless <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ city <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ state <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ zip_code <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ booking_date <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ facility <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ transient <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ wkday <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ month_ <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan,...
$ year <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,...
$ booked_date <date> 2000-01-01, 2000-01-01, 2000-01-01, 2000-01-01, 2000-01-01...
# NOTE(review): `ages` is not defined in the code visible here — presumably it
# is computed from `dob` in a hidden chunk; confirm before re-running.
# The summary below contains implausible values (min -1, max 133), which
# suggests some dates need checking — TODO confirm.
summary(ages)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.0 22.0 29.0 31.9 40.0 133.0
# Percentage of bookings flagged as transient, ignoring missing values.
100 * mean(bk$transient, na.rm = TRUE)
[1] 13.05669
# Percentage of bookings with the `boulder` flag set (NA values are not
# removed here, so any NA would make the result NA).
100 * mean(bk$boulder)
[1] 6.589646
# List the distinct arresting agencies (levels of the factor).
levels(bk[['arresting_agency']])
[1] "BOULDER COUNTY DRUG TASK FORCE" "BOULDER COUNTY SHERIFFS OFFICE"
[3] "BOULDER PD" "COLORADO STATE PATROL"
[5] "COMMUNITY CORRECTIONS" "DISTRICT ATTORNEYS OFFICE"
[7] "ERIE PD" "JAIL MITTS ONLY"
[9] "LAFAYETTE PD" "LONGMONT PD"
[11] "LOUISVILLE PD" "NEDERLAND MARSHALS OFFICE"
[13] "OTHER" "PAROLE"
[15] "STATE DIVISION OF WILDLIFE" "UNIVERSITY OF COLORADO"
[17] "WARD MARSHALS OFFICE"
# Bookings per arresting agency, drawn as horizontal bars ordered by count.
bk %>%
  count(arresting_agency) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = reorder(arresting_agency, n), y = n, fill = arresting_agency)) +
  geom_col() +
  coord_flip()

# Bookings per race code, drawn as horizontal bars ordered by count.
bk %>%
  count(race) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = reorder(race, n), y = n, fill = race)) +
  geom_col() +
  coord_flip()
# Monthly booking counts (by arrest month) for 2011 through 2015, one panel
# per year. The year list previously repeated 2015 and skipped 2014, so the
# 2014 panel was missing.
bk %>%
  filter(year %in% 2011:2015) %>%
  group_by(year, month_) %>%
  tally() %>%
  ggplot(aes(month_, n)) +
  geom_point() +
  geom_col(aes(fill = month_)) +
  facet_wrap(~year)
# Booking counts per weekday across the whole table (rows without a weekday
# are excluded).
bk %>%
  filter(!is.na(wkday)) %>%
  count(wkday) %>%
  ggplot(aes(x = wkday, y = n, fill = wkday)) +
  geom_col() +
  labs(title = 'Total Arrests By Day, for ALL data')

# Same weekday counts, split into one panel per year.
bk %>%
  filter(!is.na(wkday), year > 1999) %>%
  count(year, wkday) %>%
  ggplot(aes(x = wkday, y = n, fill = wkday)) +
  geom_col() +
  facet_wrap(~year) +
  labs(title = 'Total Arrests By Day, for each year')

# Same weekday counts, split into one panel per calendar month (years pooled).
bk %>%
  filter(!is.na(wkday), year > 1999) %>%
  count(month_, wkday) %>%
  ggplot(aes(x = wkday, y = n, fill = wkday)) +
  geom_col() +
  facet_wrap(~month_) +
  labs(title = 'Total Arrests By Day, for each month, includes all years')
# Number of bookings per arrest date (dates strictly after 2000-01-01) with a
# fitted linear trend. The y limits remove days with more than 60 bookings.
bk %>%
  filter(arrest_date > as.Date("2000-01-01")) %>%
  count(arrest_date) %>%
  ggplot(aes(x = arrest_date, y = n)) +
  geom_point(alpha = 0.2) +
  scale_y_continuous(limits = c(0, 60)) +
  geom_smooth(method = "lm")
NA
# Attach the weather observed on each booking's arrest date; every booking row
# is kept, with NA weather where no matching date exists.
bk_wea <- bk %>%
  left_join(wea, by = c('arrest_date' = 'date'))
glimpse(bk_wea)
Observations: 163,939
Variables: 37
$ name <chr> "LAWYER,KENNETH A", "COPELAND,MARK WILLIAM", "DITZEL,HOWARD...
$ booked <dttm> 2000-01-01 04:01:00, 2000-01-01 03:39:00, 2000-01-01 06:00...
$ location <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ dob <date> 1958-09-29, 1953-01-27, 1948-01-25, 1977-03-15, 1980-06-22...
$ race <chr> "W", "W", "W", "W", "W", "W", "W", "W", "W", "W", "W", "W",...
$ sex <chr> "M", "M", "M", "F", "M", "M", "M", "F", "M", "M", "M", "M",...
$ case_no <int> 991001313, 1089421, 1031978, 1103574, 5, 1101849, 1089420, ...
$ arresting_agency <fctr> JAIL MITTS ONLY, LAFAYETTE PD, BOULDER PD, LONGMONT PD, BO...
$ arrest_date <date> 2000-01-16, 2000-01-01, 2000-01-01, 2000-01-01, 2000-01-01...
$ camping <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ fta <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
$ ftc <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ booking_time <dttm> 2000-01-01 04:01:00, 2000-01-01 03:39:00, 2000-01-01 06:00...
$ boulder <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ urination <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ vehicle_as_residence <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ public_obstruct <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ public_trespass <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ begging <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ antihomeless <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ smoking <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ any_antihomeless <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ city <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ state <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ zip_code <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ booking_date <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ facility <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ transient <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ wkday <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ month_ <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan,...
$ year <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,...
$ booked_date <date> 2000-01-01, 2000-01-01, 2000-01-01, 2000-01-01, 2000-01-01...
$ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ snow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ tmax <dbl> 49, 54, 54, 54, 54, 54, 54, 51, 54, 54, 54, 49, 54, 54, 54,...
$ tmin <dbl> 22, 29, 29, 29, 29, 29, 29, 35, 29, 29, 29, 22, 29, 29, 29,...
# Daily count of bookings flagged `antihomeless` against that day's minimum
# temperature, with a linear fit.
bk %>%
  filter(antihomeless) %>%
  count(arrest_date) %>%
  left_join(wea, by = c('arrest_date' = 'date')) %>%
  ggplot(aes(x = tmin, y = n)) +
  geom_point() +
  geom_smooth(method = 'lm')

# Daily count of all 2002 bookings against minimum temperature, with a linear
# fit. The per-day tally keeps only arrest_date and n, so the weather columns
# are joined back on afterwards.
bk_wea %>%
  filter(year == 2002) %>%
  count(arrest_date) %>%
  left_join(wea, by = c('arrest_date' = 'date')) %>%
  ggplot(aes(x = tmin, y = n)) +
  geom_point() +
  geom_smooth(method = 'lm')

# Minimum temperature (black) and daily booking count (red) over time, drawn
# on the same y axis.
bk %>%
  count(arrest_date) %>%
  left_join(wea, by = c('arrest_date' = 'date')) %>%
  ggplot(aes(x = arrest_date, y = tmin)) +
  geom_point() +
  geom_point(aes(y = n), colour = 'red')